# Usual packages
import os
import json
import requests
import datetime
import time
import joblib
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from pandas.io.json import json_normalize
from itertools import chain
# graph related packages
import cufflinks as cf
import plotly.offline
import matplotlib.pyplot as plt
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from PIL import Image
from os import path
# text related packages
import spacy
# Load the language model
nlp = spacy.load('en_core_web_lg')
from spacy.lang.en.stop_words import STOP_WORDS
import textblob
from textblob import TextBlob
import re
# need for topic modelling
from sklearn import metrics
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
# Request metadata for the Rotten Tomatoes "napi" user-review endpoint.
headers = {
    'Referer': 'https://www.rottentomatoes.com/m/the_lion_king_2019/reviews?type=user',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, likeGecko) Chrome/74.0.3729.108 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}
url = 'https://www.rottentomatoes.com/napi/movie/9057c2cf-7cab-317f-876f-e50b245ca76e/reviews/user'
# Cursor-based pagination state; endCursor is advanced after each fetched page.
payload = {'direction': 'next', 'endCursor': '', 'startCursor': ''}
### Web scraping: page through the review API, storing one JSON file per page.
parent_path = os.getcwd()
## Storing all the json data in movie_rdata dir.
# os.path.join is portable; the old parent_path + "\\movie_rdata" only worked
# on Windows.
reviews_path = os.path.join(parent_path, "movie_rdata")
os.chdir(reviews_path)
s = requests.Session()
i = 0
while i < 302:  # hard cap on pages to fetch
    time.sleep(5)  # be polite to the server between requests
    print(payload, "i=", i)
    r = s.get(url, headers=headers, params=payload)  # GET call
    data = r.json()
    filename = "page" + str(i) + ".json"
    # `with` closes the file on exit; the old explicit close() after the
    # block was redundant.
    with open(filename, 'w') as json_file:
        json.dump(data, json_file)
    i = i + 1
    if data['pageInfo']['hasNextPage']:
        # Advance the cursor so the next request fetches the following page.
        payload = {
            'direction': 'next',
            'endCursor': data['pageInfo']['endCursor'],
            'startCursor': ''
        }
    else:
        # No further pages: stop instead of re-requesting the same cursor.
        break
# Process data from the collected dir and build one DataFrame of all reviews.
print(reviews_path)
# pandas.io.json.json_normalize is deprecated and DataFrame.append was removed
# in pandas 2.0 (and appending in a loop is quadratic): normalize each page
# into a list and concatenate once at the end instead.
page_frames = []
for review_file in os.listdir(reviews_path):
    if review_file.endswith(".json"):
        with open(review_file) as infile:
            jdata = json.load(infile)
        page_frames.append(pd.json_normalize(jdata['reviews']))
total_files_processed = len(page_frames)
print("Total files processed=", total_files_processed)
os.chdir(parent_path)
# ignore_index=True replaces the old reset_index(drop=True) step: each page
# JSON re-used row indexes 0-9.
movie_df = pd.concat(page_frames, ignore_index=True)
movie_df.head(12)
movie_df.head()
#### Rating score >3 is considered positive (0) and negative (1) otherwise
movie_df['targetSentiment'] = [0 if s > 3 else 1 for s in movie_df['score']]
pd.DataFrame(movie_df, columns=['targetSentiment', 'score']).head()
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print(movie_df.info())
print("---+++----+++------+++----+++---SHAPE---+++----+++------+++----+++------+++----+++---+++")
print(movie_df.shape)
print("---+++----+++------+++----+++---COLUMNS---+++----+++------+++----+++----+++------+++----+++")
print(movie_df.columns)
# Parse both timestamp columns into proper datetimes.
for date_col in ('createDate', 'updateDate'):
    movie_df[date_col] = pd.to_datetime(movie_df[date_col])
movie_df.info()
movie_df.isnull().sum()
print("---+++----+++------+++----+++---Null Inference---+++----+++------+++----+++------+++----+++---+++")
print("Accounlink/displayName/ImageUrl/AccountLink is having NULL values, which can be expected.")
HAPPY = 0
SAD = 1
# Positive vs negative review counts (computed once, reused below).
sentiment_counts = movie_df.groupby('targetSentiment').score.count()
sentiment_counts.iplot(
    kind='bar',
    barmode='group',
    title='Sentiment - Positive vs Negative ',
    linecolor='black',
    xTitle='Sentiment',
    yTitle='# Reviewers'
)
pos_users = sentiment_counts[0]
neg_users = sentiment_counts[1]
pos_neg_split = np.round(neg_users / (neg_users + pos_users), 4) * 100
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("% Of people who did not like this movie", pos_neg_split)
movie_df.score.value_counts().iplot(
    kind='bar',
    xTitle='rating',
    linecolor='black',
    yTitle='# Reviewers',
    title="Reviewer's Rating Distribution")
# Ratings split by verified and super-reviewer flags, shown as stacked bars.
verifiedr_ratings = movie_df[movie_df['isVerified'] == 1]['targetSentiment'].value_counts()
not_verifiedr_ratings = movie_df[movie_df['isVerified'] == 0]['targetSentiment'].value_counts()
superr_ratings = movie_df[movie_df['isSuperReviewer'] == 1]['targetSentiment'].value_counts()
not_superr_ratings = movie_df[movie_df['isSuperReviewer'] == 0]['targetSentiment'].value_counts()
df1 = pd.DataFrame([verifiedr_ratings, not_verifiedr_ratings])
df2 = pd.DataFrame([superr_ratings, not_superr_ratings])
df1.index = ['Verified User', 'not Verified User']
df2.index = ['Super User', 'not Super User']
df1.iplot(kind='bar', barmode='stack', title='Ratings by Verified User Type')
df2.iplot(kind='bar', barmode='stack', title='Ratings by Super User Type')
# Day-wise rating trend: index by update date and resample per day.
movie_trend = movie_df.copy(deep=True)
movie_trend.index = movie_trend['updateDate']
# Select the 'score' column *before* aggregating: calling .mean() on the whole
# resampled frame errors on non-numeric columns in modern pandas and wastes
# work; the plotted values are identical.
movie_trend.resample('D')['score'].mean().iplot(
    kind='bar',
    bins=50,
    xTitle='Date',
    linecolor='black',
    yTitle='mean(Rating)',
    title='Daywise Rating Distribution')
movie_trend.resample('D')['score'].count().iplot(
    kind='bar',
    bins=50,
    xTitle='Date',
    linecolor='black',
    yTitle='# Ratings',
    title='Rating Traffic Trend')
# Spoiler usage stats (value_counts computed once and reused).
spoiler_counts = movie_df.hasSpoilers.value_counts()
print(spoiler_counts)
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("\n% Of Users using Spoilers", round(spoiler_counts[1] / (spoiler_counts[0] +
                                                               spoiler_counts[1]), 4))
# Profanity usage stats.
profanity_counts = movie_df.hasProfanity.value_counts()
print(profanity_counts)
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("\n% Of Users using Profanity", round(profanity_counts[1] / (profanity_counts[0] +
                                                                   profanity_counts[1]), 4))
movie_trend[movie_trend['hasProfanity'] == True].score.sort_index(axis=0).iplot(
    kind='bar',
    xTitle='Date',
    linecolor='black',
    yTitle='Ratings',
    title='Profanity vs Rating')
movie_trend[movie_trend['hasProfanity'] == True].score.sort_index(axis=0).value_counts().iplot(
    kind='bar',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='#Rating Distribution For Profanity reviews')
# NOTE: an earlier computation of this ratio was immediately overwritten (dead
# code) and has been removed; only the %-of-all-users figure is reported.
perc_prof_reviews = round((profanity_counts[1] / movie_df['user.userId'].count()) * 100, 2)
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("% of Users using profanity :" + str(perc_prof_reviews) + "\nThis is very less. Can be ignored.")
# Work on a deep copy so movie_df stays untouched downstream.
total_data = movie_df.copy(deep=True)
total_data.columns.values
# Keep only the review text, score and sentiment; drop all metadata columns.
columns_except_review_score = ['createDate', 'displayImageUrl', 'displayName', 'hasProfanity',
                               'hasSpoilers', 'isSuperReviewer', 'isVerified', 'rating',
                               'timeFromCreation', 'updateDate', 'user.accountLink',
                               'user.displayName', 'user.realm', 'user.userId']
total_data.drop(columns_except_review_score, inplace=True, axis=1)
total_data.head()
# Word-count and character-length distributions of the raw review text.
total_data['word_count'] = total_data['review'].apply(lambda review: len(str(review).split()))
total_data['word_count'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Word count distribution before cleaning text', colors='#604d9e')
total_data['review_len'] = total_data['review'].astype(str).apply(len)
total_data['review_len'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Length Distribution before cleaning text')
print("---+++----+++------+++----+++---Avg(ReviewLen)/Avg(#Words)---+++----+++------+++----+++------+++----+++---+++")
print("\nReview Length is greater than #words in it by factor of:",
      round(total_data['review_len'].sum() / total_data['word_count'].sum(), 2))
print("---+++----+++------+++----+++---#Word>700---+++----+++------+++----+++------+++----+++---+++")
print(total_data[total_data['word_count'] > 700])
print("---+++----+++------+++----+++---ReviewLength>4000---+++----+++------+++----+++------+++----+++---+++")
print(total_data[total_data['review_len'] > 4000])
# Inspect the two extreme-length reviews individually.
print(movie_df.iloc[1854])
print("---+++----+++------+++----+++---INFERENCE---+++----+++------+++----+++------+++----+++---+++")
print("\nHe liked the movie, so wrote lengthy summary")
print(movie_df.iloc[1953])
print("---+++----+++------+++----+++---INFERENCE---+++----+++------+++----+++------+++----+++---+++")
print("\nHe dis-liked the movie, so wrote lengthy summary with some spoiler movie events!")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
def get_top_n_unigram(corpus, n=None):
    """Return the n most frequent single words in *corpus* as (word, count) pairs."""
    vectorizer = CountVectorizer().fit(corpus)
    # Column-wise sum over the doc-term matrix gives per-term corpus counts.
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq.sort(key=lambda pair: pair[1], reverse=True)
    return freq[:n]
# Top 45 unigrams in the raw review text.
common_words = get_top_n_unigram(total_data['review'], 45)
df_uni = pd.DataFrame(common_words, columns=['unigrams', 'count'])
df_uni.groupby('unigrams').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 45 Unigrams in review before cleaning')
def get_top_n_bigram(corpus, n=None):
    """Return the n most frequent 2-grams in *corpus* as (bigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    # Column-wise sum over the doc-term matrix gives per-bigram corpus counts.
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq.sort(key=lambda pair: pair[1], reverse=True)
    return freq[:n]
# Top 45 bigrams in the raw review text.
common_words = get_top_n_bigram(total_data['review'], 45)
df_bi = pd.DataFrame(common_words, columns=['bigrams', 'count'])
df_bi.groupby('bigrams').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 45 Bigrams in review before cleaning')
def get_top_n_trigram(corpus, n=None):
    """Return the n most frequent 3-grams in *corpus* as (trigram, count) pairs."""
    vectorizer = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    # Column-wise sum over the doc-term matrix gives per-trigram corpus counts.
    totals = vectorizer.transform(corpus).sum(axis=0)
    freq = [(term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()]
    freq.sort(key=lambda pair: pair[1], reverse=True)
    return freq[:n]
# Top 45 trigrams in the raw review text.
common_words = get_top_n_trigram(total_data['review'], 45)
df_tri = pd.DataFrame(common_words, columns=['trigrams', 'count'])
df_tri.groupby('trigrams').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 45 Trigrams in review before cleaning')
#print(list(STOP_WORDS))
from itertools import compress
from contractions import CONTRACTION_MAP
import unicodedata
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand English contractions in *text* (e.g. "don't" -> "do not") using
    CONTRACTION_MAP, then strip any remaining apostrophes."""
    # One alternation over every mapped contraction; IGNORECASE makes the
    # match case-tolerant, the lookup falls back to the lower-cased key.
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        # Replace one regex match with its expansion, preserving the case of
        # the first character (e.g. "Don't" -> "Do not").
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
            if contraction_mapping.get(match)\
            else contraction_mapping.get(match.lower())
        # NOTE(review): assumes every matched string exists in the mapping in
        # exact or lower-case form; otherwise this is None and the next line
        # raises TypeError — confirm CONTRACTION_MAP covers all keys.
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
    expanded_text = contractions_pattern.sub(expand_match, text)
    # Drop leftover apostrophes (possessives, unmapped forms).
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
def normalize_accented_chars(text):
    """Strip accents by NFKD-decomposing *text* and discarding non-ASCII bytes.

    See https://docs.python.org/3/library/unicodedata.html for the
    normalization forms.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
def add_NOT_after_not(sentence=None):
    """Prefix each word after a negation ('not'/'never'/'no') with 'NOT_' up to
    the next punctuation mark, so negated phrases keep their polarity."""
    def _mark_negated(match):
        # Tag every word in the negated span (the negation word itself keeps
        # its original form because the inner pattern requires leading space).
        return re.sub(r'(\s+)(\w+)', r'\1NOT_\2', match.group(0))
    return re.sub(r'\b(?:not|never|no)\b[\w\s]+[^\w\s]',
                  _mark_negated,
                  sentence,
                  flags=re.IGNORECASE)
def tokenize_lemma_clean(dataframe,column_name):
    """Lower-case, tokenize and lemmatize each row of *column_name* with spaCy,
    dropping stop words, tokens of <=2 chars and non-alphabetic tokens; the
    cleaned text is written to a new column "clean_text1"."""
    # text cleaning and pre-processing
    for index,row in dataframe.iterrows():
        # Lower-casing mutates only the local row copy produced by iterrows,
        # not the underlying frame; the spaCy doc is built from that copy.
        row[column_name]=row[column_name].lower()
        doc=nlp(row[column_name])
        # remove stop words
        clean_tokens1 = [token for token in doc if not token.is_stop]
        # may use STOP words for conjuction detection
        # remove words with <=2 chars
        clean_tokens1 = [token for token in clean_tokens1 if len(token.text)>2]
        # remove non-alpha tokens (punctuation, numbers) via a boolean mask
        clean_tokens2_bool = [token.is_alpha for token in clean_tokens1]
        clean_tokens2=list(compress(clean_tokens1,clean_tokens2_bool))
        # keep only the lemma of each surviving token
        clean_tokens3 = [token.lemma_ for token in clean_tokens2]
        clean_text=' '.join(clean_tokens3)
        # write the result back to a *new* column, preserving the source text
        dataframe.at[index,"clean_text1"]=clean_text
def pos_to_keep(dataframe,column_name,dest_column_name,x1=None,x2=None,x3=None,x4=None):
    """Keep only tokens whose POS tag is one of x1..x4 (plus the
    sentiment-bearing words 'like'/'love'), writing the filtered text into
    *dest_column_name*."""
    wanted_pos = [x1, x2, x3, x4]
    for index, row in dataframe.iterrows():
        doc = nlp(row[column_name])
        kept = [token.text for token in doc
                if token.text in ['like', 'love'] or token.pos_ in wanted_pos]
        dataframe.at[index, dest_column_name] = ' '.join(kept)
## Keep intermediate cleaned data in 'clean_text' for comparison with the original.
total_data.insert(2, "clean_text", "")
# Pipeline: expand contractions -> strip accents -> tag negated words.
total_data['clean_text'] = [expand_contractions(text) for text in total_data['review']]
total_data['clean_text'] = [normalize_accented_chars(text) for text in total_data['clean_text']]
total_data['clean_text'] = [add_NOT_after_not(text) for text in total_data['clean_text']]
## 'clean_text1' receives the final cleaned data after lemmatization.
tokenize_lemma_clean(total_data, "clean_text")
total_data.head(20)
# Length distributions after cleaning.
total_data['word_count_clean'] = total_data['clean_text1'].apply(lambda text: len(str(text).split()))
total_data['word_count_clean'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Word count Distribution after cleaning text', colors='#604d9e')
total_data['review_len'] = total_data['clean_text1'].astype(str).apply(len)
total_data['review_len'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Length Distribution after cleaning text')
# Top n-grams recomputed on the cleaned text.
common_words = get_top_n_unigram(total_data['clean_text1'], 45)
df_uni_clean = pd.DataFrame(common_words, columns=['unigrams', 'count'])
df_uni_clean.groupby('unigrams').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 45 Unigrams in review after text cleaning')
common_words = get_top_n_bigram(total_data['clean_text1'], 45)
df_bi_clean = pd.DataFrame(common_words, columns=['bigrams', 'count'])
df_bi_clean.groupby('bigrams').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 45 Bigrams in review after text cleaning')
common_words = get_top_n_trigram(total_data['clean_text1'], 45)
df_tri_clean = pd.DataFrame(common_words, columns=['trigrams', 'count'])
df_tri_clean.groupby('trigrams').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black',
    title='Top 45 Trigrams in review after text cleaning')
# Word cloud over all cleaned reviews, shaped by the lion-image mask.
all_words = ' '.join(total_data['clean_text1'])
lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white", width=5000, height=5000, random_state=21,
                      max_font_size=110, mask=lion_mask, contour_width=3,
                      contour_color='steelblue').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Same cloud restricted to negative-sentiment reviews (orange contour).
negative_sent_text = total_data[total_data['targetSentiment'] == 1]['clean_text1']
all_words = ' '.join(negative_sent_text)
lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white", width=5000, height=5000, random_state=21,
                      max_font_size=110, mask=lion_mask, contour_width=3,
                      contour_color='orange').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
#!pip install textblob
from textblob import TextBlob
# POS-tag the whole cleaned corpus with TextBlob and plot the 20 most common tags.
blob = TextBlob(str(total_data['clean_text1'].tolist()))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
pos_df = pos_df.pos.value_counts()[:20]
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count',
    title='Top 20 Part-of-speech tagging for review corpus')
# 2-D t-SNE projection of the TF-IDF space, coloured by target sentiment.
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(total_data['clean_text1'])
tsne = TSNEVisualizer()
tsne.fit_transform(docs, total_data['targetSentiment'])
tsne.poof()
# Keep only adjectives/adverbs/verbs and re-project with t-SNE.
pos_to_keep(total_data, "clean_text1", "clean_adj_adv_verb_col", "ADJ", "ADV", "VERB")
total_data.head()
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(total_data['clean_adj_adv_verb_col'])
tsne = TSNEVisualizer()
tsne_results = tsne.fit_transform(docs, total_data['targetSentiment'])
tsne.poof()
# Adjectives only.
pos_to_keep(total_data, "clean_text1", "clean_adj_col", "ADJ")
total_data.head()
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(total_data['clean_adj_col'])
tsne = TSNEVisualizer()
tsne.fit_transform(docs, total_data['targetSentiment'])
tsne.poof()
# Adjective word cloud for negative reviews.
# love,like - keeping as default as its common and needs to be there
negative_sent_text_adj = total_data[total_data['targetSentiment'] == 1]['clean_adj_col']
all_words = ' '.join(negative_sent_text_adj)
lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white", width=5000, height=5000, random_state=21,
                      max_font_size=110, mask=lion_mask, contour_width=3,
                      contour_color='orange').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Adjective word cloud for positive reviews.
# love,like - keeping as default as its common and needs to be there
pos_sent_text_adj = total_data[total_data['targetSentiment'] == 0]['clean_adj_col']
all_words = ' '.join(pos_sent_text_adj)
lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white", width=5000, height=5000, random_state=21,
                      max_font_size=110, mask=lion_mask, contour_width=3,
                      contour_color='steelblue').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
# Define vectorizer parameters: drop terms in >90% or <5% of docs, cap the
# vocabulary at 20k features, use 1- to 4-grams with IDF weighting.
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=20000,
                                   min_df=0.05,
                                   use_idf=True, ngram_range=(1,4))
tfidf_matrix = tfidf_vectorizer.fit_transform(total_data['clean_text1'])
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (use
# get_feature_names_out() there) — this code targets an older sklearn.
tfidf_vectorizer.get_feature_names()
# LSA: reduce the TF-IDF matrix to 26 latent components via truncated SVD.
n_components = 26
svd_model = TruncatedSVD(n_components=n_components, algorithm='randomized',n_iter=20,random_state=143)
svd_matrix = svd_model.fit(tfidf_matrix)
# Doc x topic matrix used for clustering below.
doc_topic_matrix = svd_matrix.transform(tfidf_matrix)
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print(f"tfidf_matrix.shape : {tfidf_matrix.shape}")
print(f"svd_matrix.n_components : {svd_matrix.n_components}")
print(f"\n\nExplained Variance Ratio : {svd_matrix.explained_variance_ratio_}")
print(f"\nTotal Explained Variance : {round(svd_matrix.explained_variance_ratio_.sum() * 100, 2)} %")
print(f"\nThe singular values are {svd_matrix.singular_values_}")
terms = tfidf_vectorizer.get_feature_names()
# For each latent component, print its 6 highest-weighted terms as a "topic".
for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:6]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print("\n")
# Elbow method: KMeans inertia for k = 1..7 on the doc-topic space.
Sum_of_squared_distances = []
K = range(1, 8)
for k in K:
    candidate = KMeans(n_clusters=k, random_state=143)
    candidate.fit(doc_topic_matrix)
    Sum_of_squared_distances.append(candidate.inertia_)
font = {'family': 'normal', 'weight': 'bold', 'size': 10}
plt.rc('font', **font)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Choosing the best possible elbow point
num_clusters = 4
# Use the same random_state as the elbow search above so the chosen
# clustering is reproducible and consistent with the inertia curve.
km = KMeans(n_clusters=num_clusters, random_state=143)
km.fit(doc_topic_matrix)
clusters = km.labels_.tolist()  # (previously computed twice; once is enough)
centers = km.cluster_centers_
print(f"the cluster centers are {centers}")
# Persist the fitted model for later reuse.
joblib.dump(km, 'doc_topic_cluster_best_K.pkl')
total_data['doc_topic_cluster_group'] = clusters
total_data.head()
doc_cluster_df = pd.DataFrame(total_data)
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("No. of docs in each cluster id")
doc_cluster_df['doc_topic_cluster_group'].value_counts()
from itertools import chain
# Per-cluster word frequencies. Rows are collected in a list and the frame is
# built once: DataFrame.append was removed in pandas 2.0, and the old
# columns={...} set literal made the column order nondeterministic.
doc_cluster_df['tokenized_text'] = [text.split(' ') for text in doc_cluster_df['clean_text1']]
grouped_text = doc_cluster_df.groupby('doc_topic_cluster_group')['tokenized_text']
rows = []
for num in range(num_clusters):
    # Unique words and their counts across all docs in cluster `num`,
    # sorted by descending frequency.
    values, counts = np.unique(list(chain.from_iterable(grouped_text.get_group(num))), return_counts=True)
    sorted_indices = np.argsort(-counts)
    rows.append({"values": values[sorted_indices], "counts": counts[sorted_indices], "cluster_id": num})
frequent_words_df = pd.DataFrame(rows, columns=["values", "counts", "cluster_id"])
frequent_words_df.head()
# Top-20 most frequent words per cluster, one subplot per topic.
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 35}
plt.rc('font', **font)
fig = plt.figure(figsize=(20, 50))
# The four hand-copied subplot stanzas are collapsed into a single loop.
for topic_id in range(4):
    plt.subplot(2, 2, topic_id + 1)
    plt.xlabel(f"Topic {topic_id}", fontsize=24, color='steelblue', fontfamily='serif', weight='bold')
    plt.barh(frequent_words_df.loc[topic_id, 'values'][:20],
             frequent_words_df.loc[topic_id, 'counts'][:20])
    plt.gca().invert_yaxis()  # most frequent word at the top
# Repeat the LSA topic-model pipeline on the negative-sentiment reviews only.
# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=20000,
                                   min_df=0.05,
                                   use_idf=True, ngram_range=(1,4))
neg_senti_df=total_data[total_data['targetSentiment']==1].copy(deep=True)
tfidf_matrix = tfidf_vectorizer.fit_transform(neg_senti_df['clean_text1'])
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 (use
# get_feature_names_out() there) — this code targets an older sklearn.
tfidf_vectorizer.get_feature_names()
# Reduce to 26 latent components via truncated SVD.
n_components = 26
svd_model = TruncatedSVD(n_components=n_components, algorithm='randomized',n_iter=20,random_state=143)
svd_matrix = svd_model.fit(tfidf_matrix)
doc_topic_matrix = svd_matrix.transform(tfidf_matrix)
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print(f"tfidf_matrix.shape : {tfidf_matrix.shape}")
print(f"svd_matrix.n_components : {svd_matrix.n_components}")
print(f"\n\nExplained Variance Ratio : {svd_matrix.explained_variance_ratio_}")
print(f"\nTotal Explained Variance : {round(svd_matrix.explained_variance_ratio_.sum() * 100, 2)} %")
print(f"\nThe singular values are {svd_matrix.singular_values_}")
terms = tfidf_vectorizer.get_feature_names()
# For each latent component, print its 6 highest-weighted terms as a "topic".
for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:6]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print("\n")
# Elbow method on the negative-review topic space: inertia for k = 1..7.
Sum_of_squared_distances = []
K = range(1, 8)
for k in K:
    candidate = KMeans(n_clusters=k, random_state=143)
    candidate.fit(doc_topic_matrix)
    Sum_of_squared_distances.append(candidate.inertia_)
font = {'family': 'normal', 'weight': 'bold', 'size': 10}
plt.rc('font', **font)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
# Choosing the best possible elbow point
num_clusters = 5
# random_state matches the elbow search above so the clustering is reproducible.
km = KMeans(n_clusters=num_clusters, random_state=143)
km.fit(doc_topic_matrix)
clusters = km.labels_.tolist()  # (duplicate computation removed)
centers = km.cluster_centers_
print(f"the cluster centers are {centers}")
# Persist the fitted model for later reuse.
joblib.dump(km, 'doc_topic_cluster_best_K1.pkl')
neg_senti_df['doc_topic_cluster_group'] = clusters
doc_cluster_df = pd.DataFrame(neg_senti_df)
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("No. of docs in each cluster id")
doc_cluster_df['doc_topic_cluster_group'].value_counts()
# Per-cluster word frequencies for the negative-review clustering. Rows are
# collected in a list and the frame built once: DataFrame.append was removed
# in pandas 2.0, and the old columns={...} set literal made column order
# nondeterministic.
doc_cluster_df['tokenized_text'] = [text.split(' ') for text in doc_cluster_df['clean_text1']]
grouped_text = doc_cluster_df.groupby('doc_topic_cluster_group')['tokenized_text']
rows = []
for num in range(num_clusters):
    # Unique words and counts within cluster `num`, sorted by descending frequency.
    values, counts = np.unique(list(chain.from_iterable(grouped_text.get_group(num))), return_counts=True)
    sorted_indices = np.argsort(-counts)
    rows.append({"values": values[sorted_indices], "counts": counts[sorted_indices], "cluster_id": num})
frequent_words_df = pd.DataFrame(rows, columns=["values", "counts", "cluster_id"])
frequent_words_df.head()
# Top-20 most frequent words per negative-review cluster, one subplot per topic.
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 35}
plt.rc('font', **font)
fig = plt.figure(figsize=(20, 50))
# The five hand-copied subplot stanzas are collapsed into a single loop (3x2 grid).
for topic_id in range(5):
    plt.subplot(3, 2, topic_id + 1)
    plt.xlabel(f"Topic {topic_id}", fontsize=24, color='steelblue', fontfamily='serif', weight='bold')
    plt.barh(frequent_words_df.loc[topic_id, 'values'][:20],
             frequent_words_df.loc[topic_id, 'counts'][:20])
    plt.gca().invert_yaxis()  # most frequent word at the top
def review_polarity(dataframe,column_name,dest_column_name):
    """Write each row's TextBlob sentiment polarity into *dest_column_name*."""
    for idx, row in dataframe.iterrows():
        dataframe.at[idx, dest_column_name] = TextBlob(row[column_name]).polarity
# Polarity score per cleaned review, then histograms split by target sentiment.
review_polarity(total_data, "clean_text1", "rev_polarity")
total_data.head()
sent_polarity_df = pd.DataFrame(total_data, columns=['targetSentiment', 'rev_polarity'])
sent_polarity_df[sent_polarity_df['targetSentiment'] == 0]['rev_polarity'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    color="blue",
    title='Polarity distribution of Positive reviews')
sent_polarity_df[sent_polarity_df['targetSentiment'] == 1]['rev_polarity'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    color="orange",
    title='Polarity distribution of Negative reviews')
# Final snapshot handed to downstream modelling.
final_data = total_data.copy(deep=True)
final_data.columns.values